{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "train = pd.read_csv('../data/train_xy.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test = pd.read_csv('../data/test_all.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 157)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train = train.drop(['cust_group','y','\\ufeffcust_id'],axis =1)\n",
    "x_test = test.drop(['cust_group','cust_id'],axis=1)\n",
    "x_train.shape\n",
    "x_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 157)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = pd.concat([x_train,x_test])\n",
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Y_train = train['y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i in range(96,158):\n",
    "    col = 'x'+'_'+str(i)\n",
    "    dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
    "    x = pd.concat([x, dummies_df], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(15000, 355)\n",
      "(10000, 355)\n"
     ]
    }
   ],
   "source": [
    "train_X = x[0:15000]\n",
    "test_X = x[15000:25000]\n",
    "print(train_X.shape)\n",
    "print(test_X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n",
      "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn import metrics\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n",
    "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from keras.optimizers import RMSprop, Adam\n",
    "from keras.callbacks import ReduceLROnPlateau\n",
    "from keras.callbacks import ModelCheckpoint\n",
    "from keras.utils.np_utils import to_categorical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train = X_train.values\n",
    "X_val = X_val.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train =  y_train.values\n",
    "yy_train = to_categorical(y_train)\n",
    "\n",
    "y_val =  y_val.values\n",
    "yy_val = to_categorical(y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Set the CNN model \n",
    "# my CNN architechture is In -> [[Conv2D->relu]*2 -> MaxPool2D -> Dropout]*2 -> Flatten -> Dense -> Dropout -> Out\n",
    "\n",
    "model = Sequential()\n",
    "\n",
    "model.add(BatchNormalization(input_shape=(355,)))\n",
    "model.add(Reshape((355,1,1)))\n",
    "\n",
    "\n",
    "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
    "                 activation ='relu'))\n",
    "model.add(Conv2D(filters = 16, kernel_size = 5,padding = 'Same', \n",
    "                 activation ='relu'))\n",
    "model.add(MaxPooling2D(pool_size=2, padding='same'))\n",
    "# model.add(Dropout(0.25))\n",
    "\n",
    "\n",
    "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
    "                 activation ='relu'))\n",
    "model.add(Conv2D(filters = 32, kernel_size = 3,padding = 'Same', \n",
    "                 activation ='relu'))\n",
    "model.add(MaxPooling2D(pool_size=2, strides=2, padding='same'))\n",
    "# model.add(Dropout(0.25))\n",
    "\n",
    "\n",
    "model.add(Flatten())\n",
    "model.add(Dense(256, activation = 'relu'))\n",
    "model.add(Dropout(0.5))\n",
    "model.add(Dense(2, activation = 'softmax'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 10800 samples, validate on 1200 samples\n",
      "Epoch 1/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.2493 - acc: 0.9522 - val_loss: 0.4120 - val_acc: 0.9583\n",
      "Epoch 2/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1865 - acc: 0.9530 - val_loss: 0.2823 - val_acc: 0.9583\n",
      "Epoch 3/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1729 - acc: 0.9530 - val_loss: 0.2359 - val_acc: 0.9583\n",
      "Epoch 4/15\n",
      "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1679 - acc: 0.9530 - val_loss: 0.1838 - val_acc: 0.9583\n",
      "Epoch 5/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1632 - acc: 0.9530 - val_loss: 0.1968 - val_acc: 0.9583\n",
      "Epoch 6/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1657 - acc: 0.9530 - val_loss: 0.1643 - val_acc: 0.9583\n",
      "Epoch 7/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1614 - acc: 0.9530 - val_loss: 0.2133 - val_acc: 0.9583\n",
      "Epoch 8/15\n",
      "10800/10800 [==============================] - 13s 1ms/step - loss: 0.1626 - acc: 0.9530 - val_loss: 0.1540 - val_acc: 0.9583\n",
      "Epoch 9/15\n",
      "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1612 - acc: 0.9530 - val_loss: 0.1574 - val_acc: 0.9583\n",
      "Epoch 10/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1605 - acc: 0.9530 - val_loss: 0.1564 - val_acc: 0.9583\n",
      "Epoch 11/15\n",
      "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1586 - acc: 0.9530 - val_loss: 0.1549 - val_acc: 0.9583\n",
      "Epoch 12/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1585 - acc: 0.9530 - val_loss: 0.1545 - val_acc: 0.9583\n",
      "Epoch 13/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1594 - acc: 0.9530 - val_loss: 0.1565 - val_acc: 0.9583\n",
      "Epoch 14/15\n",
      "10800/10800 [==============================] - 15s 1ms/step - loss: 0.1570 - acc: 0.9530 - val_loss: 0.1589 - val_acc: 0.9583\n",
      "Epoch 15/15\n",
      "10800/10800 [==============================] - 14s 1ms/step - loss: 0.1569 - acc: 0.9529 - val_loss: 0.1572 - val_acc: 0.9583\n"
     ]
    }
   ],
   "source": [
    "history=model.fit(X_train,yy_train, batch_size=256, epochs=15, verbose=1, validation_split=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3000/3000 [==============================] - 1s 452us/step\n",
      "0.764895321667\n"
     ]
    }
   ],
   "source": [
    "predictions = model.predict_proba(X_val,verbose=1)\n",
    "pre = predictions[:,1]\n",
    "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
    "print(val_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10000/10000 [==============================] - 5s 453us/step\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(10000,)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds = model.predict_proba(test_X.values)\n",
    "pred = preds[:,1]\n",
    "pred.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
    "Submission.to_csv('../result/nn.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
